library(ggplot2)
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
##
## ======================
## Welcome to heatmaply version 0.16.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================
library(Rtsne)
# Load the GDSC gene-expression table; the "X" column supplies the row names
# (the PCA scores printed further down show cell-line names such as AU565).
# stringsAsFactors = TRUE is needed on R >= 4.0, where read.csv() keeps text
# columns as character by default: with factors, summary() reports per-level
# counts for GeneralType / Type instead of only "Length / Class / Mode".
data <- read.csv("geneExpression_GDSC.csv", row.names = "X",
                 stringsAsFactors = TRUE)
data
# Number of cell lines per general tissue type (including NA's).
summary(data$GeneralType)
## aero_dig_tract bone breast digestive_system
## 77 35 49 48
## kidney large_intestine leukemia lung
## 33 45 76 22
## lung_NSCLC lung_SCLC lymphoma myeloma
## 108 55 65 17
## nervous_system neuroblastoma pancreas skin
## 55 31 30 53
## soft_tissue thyroid urogenital_system NA's
## 19 16 100 1
# Tabulate the cell lines by their detailed type annotation.
summary(data[["Type"]])
## acute_myeloid_leukaemia adrenal_gland
## 24 1
## anaplastic_large_cell_lymphoma B_cell_leukemia
## 3 12
## B_cell_lymphoma biliary_tract
## 31 5
## Bladder bone_other
## 19 2
## breast Burkitt_lymphoma
## 49 13
## cervix chondrosarcoma
## 14 3
## chronic_myeloid_leukaemia digestive_system_other
## 10 1
## endometrium ewings_sarcoma
## 11 21
## fibrosarcoma glioma
## 2 51
## haematopoietic_neoplasm other hairy_cell_leukaemia
## 6 3
## head and neck Hodgkin_lymphoma
## 42 9
## kidney large_intestine
## 32 45
## leukemia liver
## 3 14
## lung_NSCLC_adenocarcinoma lung_NSCLC_carcinoid
## 65 4
## lung_NSCLC_large cell lung_NSCLC_not specified
## 13 11
## lung_NSCLC_squamous_cell_carcinoma Lung_other
## 15 1
## lung_small_cell_carcinoma lymphoblastic_leukemia
## 55 11
## lymphoblastic_T_cell_leukaemia lymphoid_neoplasm other
## 8 10
## medulloblastoma melanoma
## 4 50
## mesothelioma myeloma
## 21 12
## neuroblastoma oesophagus
## 31 35
## osteosarcoma ovary
## 9 41
## pancreas prostate
## 30 7
## rhabdomyosarcoma skin_other
## 9 3
## soft_tissue_other stomach
## 8 28
## T_cell_leukemia testis
## 3 1
## thyroid urogenital_system_other
## 16 4
## uterus NA's
## 3 1
You can find handy cheat sheets (e.g., for ggplot2) here ;) https://rstudio.com/resources/cheatsheets/
Let’s create a bar chart showing the number of cell lines per tissue type.
# One bar per GeneralType, with bar height = number of rows (cell lines).
# The theme() layer rotates the x-axis labels by 90 degrees.
ggplot(data, aes(x = GeneralType, fill = GeneralType)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
# Exercise: what happens if you remove theme()?
Let’s try another plot that takes two variables.
# Distribution of the JUN column within each GeneralType, one box per type.
# (Optionally add a geom_point() layer to overlay the individual values.)
ggplot(data, aes(x = GeneralType, y = JUN, fill = GeneralType)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
# Exercise: try other genes in place of JUN.
set.seed(1000) # to make sampling reproducible
# Keep only the cell lines whose GeneralType is exactly "breast" or "lung".
filter_cell <- data$GeneralType %in% c("breast", "lung")
# Draw the 50 random columns from the non-annotation columns only. Sampling
# from all of colnames(data) could pick "Type"/"GeneralType" a second time and
# put duplicated annotation columns into the heatmap input.
annotation_cols <- c("Type", "GeneralType")
gene_cols <- setdiff(colnames(data), annotation_cols)
filter_gene <- c(annotation_cols, sample(gene_cols, 50))
# Size the widget through heatmaply()'s own width/height arguments; passing
# them to plotly::layout() is deprecated.
heatmaply(data[filter_cell, filter_gene], column_text_angle = 90,
          hclust_method = "average", width = 500, height = 500)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
## Warning: 'heatmap' objects don't have these attributes: 'showlegend'
## Valid attributes include:
## 'type', 'visible', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'z', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'transpose', 'xtype', 'ytype', 'zsmooth', 'connectgaps', 'xgap', 'ygap', 'zhoverformat', 'hovertemplate', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
# Restrict the analysis to breast, lung and pancreas cell lines.
filter_cell <- data$GeneralType %in% c("breast", "lung", "pancreas")
# Select the numeric columns by type rather than by position (3:ncol(data)),
# so the selection does not depend on where the annotation columns sit.
is_numeric_col <- vapply(data, is.numeric, logical(1))
PCA <- prcomp(data[filter_cell, is_numeric_col])
# Variance captured by the leading components (up to the first 10); head()
# avoids NA bars when fewer than 10 components exist.
barplot(head(PCA$sdev, 10)^2, las = 2)
# Scores of the first 10 cell lines on the first 5 principal components.
PCA$x[1:10, 1:5]
## PC1 PC2 PC3 PC4 PC5
## AU565 -13.724367 -1.871347 -5.3455675 1.9187103 -2.8918538
## BT-20 -5.445793 -5.249889 0.9582700 -1.8191926 -2.2218144
## BT-474 -15.100586 1.672376 -4.5099641 -1.5250846 2.0464121
## BT-483 -17.661129 2.925127 0.4103088 0.5627541 0.5346582
## BT-549 7.192831 12.845869 1.8666512 -3.2586385 -3.1331359
## CAL-120 7.303171 6.230816 2.3337983 1.5738670 4.7528825
## CAL-148 -16.637455 5.462866 -0.1385591 3.0094828 1.4583948
## CAL-51 -0.162678 3.928606 6.1819198 0.4521046 0.6904870
## CAL-85-1 10.393089 -6.561907 -1.3468198 -10.0114684 -4.4460762
## CAMA-1 -17.180801 2.735580 -5.9790370 -0.6497663 1.3056740
# Put the PCA scores in a data frame and attach the tissue type of each
# selected cell line so it can be used for colouring / annotation.
df <- as.data.frame(PCA$x)
df$Type <- data$GeneralType[filter_cell]
# Scatter plot of the first two principal components, coloured by tissue type.
ggplot(df, aes(x = PC1, y = PC2, col = Type)) + geom_point()
# Clustered heatmap of the same two components. The widget is sized through
# heatmaply()'s width/height arguments; passing them to plotly::layout() is
# deprecated.
heatmaply(df[, c("PC1", "PC2", "Type")], column_text_angle = 90,
          hclust_method = "average", width = 500, height = 500)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
## Warning: 'heatmap' objects don't have these attributes: 'showlegend'
## Valid attributes include:
## 'type', 'visible', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'z', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'transpose', 'xtype', 'ytype', 'zsmooth', 'connectgaps', 'xgap', 'ygap', 'zhoverformat', 'hovertemplate', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
Unlike PCA, t-SNE preserves local rather than global structure: only each point's closest neighbors are considered, and the perplexity parameter controls the number of closest neighbors considered.
Also, t-SNE is stochastic, so its result is not reproducible between runs unless the random seed is fixed (e.g., with set.seed()), whereas PCA is deterministic. However, the algorithm often gives a visually pleasing outcome.
# Restrict the analysis to breast, lung and pancreas cell lines.
filter_cell <- data$GeneralType %in% c("breast", "lung", "pancreas")
# Select the numeric columns by type rather than by position (3:ncol(data)).
is_numeric_col <- vapply(data, is.numeric, logical(1))
# Rtsne() is stochastic; fixing the seed makes the embedding (and the plots
# below) identical from run to run.
set.seed(1000)
tsne <- Rtsne(data[filter_cell, is_numeric_col], dims = 2, perplexity = 30,
              max_iter = 5000)
# tsne$Y holds the 2-D embedding; as.data.frame() names its columns V1 and V2.
df <- as.data.frame(tsne$Y)
df$Type <- data$GeneralType[filter_cell]
ggplot(df, aes(x = V1, y = V2, col = Type)) + geom_point()
# Size the widget through heatmaply()'s width/height arguments; passing them
# to plotly::layout() is deprecated.
heatmaply(df, column_text_angle = 90,
          hclust_method = "average", width = 500, height = 500)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
## Warning: 'heatmap' objects don't have these attributes: 'showlegend'
## Valid attributes include:
## 'type', 'visible', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'z', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'transpose', 'xtype', 'ytype', 'zsmooth', 'connectgaps', 'xgap', 'ygap', 'zhoverformat', 'hovertemplate', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'